Apache License 2.0


Needed packages

library(raster)

Source

We start with two source lines and right after them the code from those two scripts.

source("src/RMSE.r")

Calculate the Root Mean Squared Error.

RMSE = function(truth, prediction)
{
    return(sqrt(mean((truth-prediction)^2, na.rm=TRUE)))
}

Calculate partial RMSE for different zones by generating squared difference rasters.

GetDifferenceRaster = function(former, latter, filename=paste("data/", names(latter), ".grd", sep=""), force=FALSE)
{
    # Figure out a reasonable output name
    if (filename == "data/layer.grd")
        warning("Please pass filename or assign meaningful names to input rasters! Else the loaded data may not match expectations.")
    
    # Create one or load one
    if (!file.exists(filename) || force)
    {
        return(overlay(former, latter, fun=function(truth, prediction){(truth-prediction)^2},
            filename=filename))
    }
    else
        return(raster(filename))
}

Calculate RMSE per zone.

StratifiedRMSE = function(truth, prediction, zones, zonenames = "", ...)
{
    # Unfortunately, zonal() just passes a vector of numbers, not a matrix.
    # So we have to calclate RMSE manually from a difference raster.
    differenceRaster = GetDifferenceRaster(truth, prediction, ...)
    
    # Get a mean from the zones in the difference raster
    zonestats = zonal(differenceRaster, zones, fun="mean")
    
    # Add nice labels, if we have them
    if (length(zonenames) > 1)
        rownames(zonestats) = zonenames
        
    return(zonestats)
}

The second source file starts here.

source("src/LinearModel.r")

Creates a linear model and prints its summary, and if desired whether some covariates ought to be removed from the model, and a plot of residuals.

lmValidation = function(..., printstep=FALSE, printplot=FALSE)
{
    LM = lm(...)
    print(summary(LM))
    if (printstep)
        step(LM)
    if (printplot)
        plot(LM)
    return(LM)
}

# Creates a prediction based on the model, clamps values to a given range, and
# if requested plots a histogram and/or a comparison plot between the layers
predictValidation = function(object, ..., truthlayer = "", range = c(-Inf, +Inf), plothist = FALSE, plotcomparison = FALSE)
{
    Prediction = predict(object, ...)
    Prediction[Prediction < range[1]] = NA
    Prediction[Prediction > range[2]] = NA
    if (plothist)
        hist(Prediction, breaks = 200, main="Histogram of predicted values")
    if (plotcomparison)
    {
        TruthExists = try(class(object[[truthlayer]]))
        if (class(TruthExists) == "try-error")
        {
            warning("Requested a comparison plot, but could not find a layer to compare to!")
            return(Prediction)
        }
        op = par(mfrow=c(1,2))
        plot(Prediction, colNA="black", main="Predicted values")
        plot(object[[truthlayer]], colNA="black", main="Ground truth")
        par(op)
    }
    return(Prediction)
}

Download/load information

GetFromWURgit = function(filename)
{
    download.file(paste("https://github.com/GeoScripting-WUR/AdvancedRasterAnalysis/raw/gh-pages/data/", filename, sep=""),
        paste("data/", filename, sep=""), "wget")
    return(paste("data/", filename, sep=""))
}
load(GetFromWURgit("GewataB1.rda"))
load(GetFromWURgit("GewataB2.rda"))
load(GetFromWURgit("GewataB3.rda"))
load(GetFromWURgit("GewataB4.rda"))
load(GetFromWURgit("GewataB5.rda"))
load(GetFromWURgit("GewataB7.rda"))
load(GetFromWURgit("vcfGewata.rda"))
load(GetFromWURgit("trainingPoly.rda"))  

DataBrick = brick(GewataB1, GewataB2, GewataB3, GewataB4, GewataB5, GewataB7, vcfGewata)
names(DataBrick) = c("Blue", "Green", "Red", "NIR", "SWIR", "Emission", "VCF")

Sanitise data

DataBrick[["VCF"]][DataBrick[["VCF"]] > 100] = NA
DataValues = as.data.frame(getValues(DataBrick))

Create a training raster for RMSE

trainingRaster = rasterize(trainingPoly, DataBrick[["VCF"]], field='Class', filename="data/trainingRaster.grd", overwrite=TRUE)
## Found 16 region(s) and 16 polygon(s)

Here we produce plots that demonstrate the relationship between the Landsat bands and the VCF tree cover.

pairs(DataBrick)

From this we can conclude that they are all negatively correlated with VCF, except for NIR. The bands are also highly correlated with each other.


Here we create a lm() model and show a summary with our own lmValidation()

LM = lmValidation(VCF ~ Blue + Green + Red + NIR + SWIR + Emission, data=DataValues, printstep = TRUE)
## 
## Call:
## lm(formula = ..1, data = ..2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.205  -4.636   0.715   5.211 258.436 
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  8.549e+01  5.723e-02 1493.805   <2e-16 ***
## Blue         9.291e-02  1.889e-04  491.817   <2e-16 ***
## Green       -1.505e-01  2.353e-04 -639.622   <2e-16 ***
## Red         -2.528e-03  1.698e-04  -14.889   <2e-16 ***
## NIR          1.661e-02  2.790e-05  595.378   <2e-16 ***
## SWIR        -2.006e-02  7.036e-05 -285.119   <2e-16 ***
## Emission     2.549e-05  8.969e-05    0.284    0.776    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.606 on 1808277 degrees of freedom
##   (13712 observations deleted due to missingness)
## Multiple R-squared:  0.8582, Adjusted R-squared:  0.8582 
## F-statistic: 1.824e+06 on 6 and 1808277 DF,  p-value: < 2.2e-16
## 
## Start:  AIC=7784664
## VCF ~ Blue + Green + Red + NIR + SWIR + Emission
## 
##            Df Sum of Sq       RSS     AIC
## - Emission  1         6 133937265 7784662
## <none>                  133937259 7784664
## - Red       1     16420 133953679 7784884
## - SWIR      1   6021286 139958546 7864181
## - Blue      1  17916072 151853332 8011681
## - NIR       1  26255617 160192877 8108358
## - Green     1  30302856 164240115 8153476
## 
## Step:  AIC=7784662
## VCF ~ Blue + Green + Red + NIR + SWIR
## 
##         Df Sum of Sq       RSS     AIC
## <none>               133937265 7784662
## - Red    1     21399 133958664 7784949
## - SWIR   1  17210900 151148165 8003262
## - Blue   1  18173224 152110489 8014738
## - Green  1  31342888 165280153 8164889
## - NIR    1  34235262 168172528 8196260

Emission can be dropped

LM = lmValidation(VCF ~ Blue + Green + Red + NIR + SWIR, data=DataValues, printstep = TRUE)
## 
## Call:
## lm(formula = ..1, data = ..2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.204  -4.636   0.715   5.211 258.586 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.550e+01  5.601e-02  1526.5   <2e-16 ***
## Blue         9.291e-02  1.876e-04   495.3   <2e-16 ***
## Green       -1.505e-01  2.314e-04  -650.5   <2e-16 ***
## Red         -2.504e-03  1.473e-04   -17.0   <2e-16 ***
## NIR          1.661e-02  2.443e-05   679.9   <2e-16 ***
## SWIR        -2.004e-02  4.158e-05  -482.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.606 on 1808278 degrees of freedom
##   (13712 observations deleted due to missingness)
## Multiple R-squared:  0.8582, Adjusted R-squared:  0.8582 
## F-statistic: 2.189e+06 on 5 and 1808278 DF,  p-value: < 2.2e-16
## 
## Start:  AIC=7784662
## VCF ~ Blue + Green + Red + NIR + SWIR
## 
##         Df Sum of Sq       RSS     AIC
## <none>               133937265 7784662
## - Red    1     21399 133958664 7784949
## - SWIR   1  17210900 151148165 8003262
## - Blue   1  18173224 152110489 8014738
## - Green  1  31342888 165280153 8164889
## - NIR    1  34235262 168172528 8196260

Nothing further can be dropped, the most significant bands are NIR and green.
NB: a linear model isn’t very appropriate, because normality and independence assumptions are violated!

BrickSubset = dropLayer(DataBrick, "Emission")

Here we predict the values and plot a histogram using the predictValidation().

Prediction = predictValidation(BrickSubset, model=LM, na.rm=TRUE, plothist=TRUE)

In the histogram we see that some values are out of range, we apply range constraints to the prediction analysis.
The results are plotted below.

Prediction = predictValidation(BrickSubset, model=LM, na.rm=TRUE, overwrite=TRUE, filename="data/PredictionRaster.grd",
    truthlayer="VCF", range=c(0, +Inf), plotcomparison=TRUE)

names(Prediction) = "Predicted.LM"

Compute the RMSE:

RMSE(getValues(DataBrick[["VCF"]]), getValues(Prediction))
## [1] 8.35677

RMSE per zone:

zonestats = StratifiedRMSE(DataBrick[["VCF"]], Prediction, trainingRaster, zonenames=levels(trainingPoly@data$Class))
zonestats
##          zone      mean
## cropland    1  78.55089
## forest      2  24.87072
## wetland     3 141.92961

As a bonus, plot the difference raster:

plot(raster("data/Predicted.LM.grd"))


Here we repeat the analysis and only use independent variables:

ReducedBrick = dropLayer(DataBrick, "Emission")
ReducedBrick = dropLayer(ReducedBrick, "Green")
ReducedBrick = dropLayer(ReducedBrick, "Red")
ReducedBrick = dropLayer(ReducedBrick, "SWIR")

pairs(ReducedBrick)

ReducedLM = lmValidation(VCF ~ Blue + NIR, data=DataValues, printstep=TRUE)
## 
## Call:
## lm(formula = ..1, data = ..2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -76.341  -9.468   1.198  10.277 172.619 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.122e+02  9.092e-02    1234   <2e-16 ***
## Blue        -2.005e-01  1.290e-04   -1555   <2e-16 ***
## NIR          8.266e-03  3.017e-05     274   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.73 on 1808281 degrees of freedom
##   (13712 observations deleted due to missingness)
## Multiple R-squared:  0.5847, Adjusted R-squared:  0.5847 
## F-statistic: 1.273e+06 on 2 and 1808281 DF,  p-value: < 2.2e-16
## 
## Start:  AIC=9728271
## VCF ~ Blue + NIR
## 
##        Df Sum of Sq       RSS      AIC
## <none>              392372126  9728271
## - NIR   1  16289145 408661271  9801822
## - Blue  1 524603632 916975758 11263267
RedPrediction = predictValidation(ReducedBrick, model=ReducedLM, na.rm=TRUE,
    filename="data/ReducedPredictionRaster.grd", overwrite=TRUE, range=c(0, +Inf),
    truthlayer="VCF", plothist=TRUE, plotcomparison=TRUE)

names(RedPrediction) = "Predicted.LM.Blue.NIR"
RMSE(getValues(ReducedBrick[["VCF"]]), getValues(RedPrediction))
## [1] 14.69918
zonestatsR = StratifiedRMSE(DataBrick[["VCF"]], RedPrediction, trainingRaster,
    zonenames=levels(trainingPoly@data$Class))
zonestatsR
##          zone     mean
## cropland    1 156.7424
## forest      2  63.1550
## wetland     3 154.6216
plot(raster("data/Predicted.LM.Blue.NIR.grd"))